pacman::p_load(rstudioapi, tidyverse, data.table, broom)
rm(list=ls())
################################################################################

#################################### World commodity prices

df = fread("../../data/prices/raw/PSOYBUSDQ.csv")[, year:= as.integer(format(as.Date(DATE, format="%Y/%m/%d"),"%Y"))]
df_1 = df[, .(soybean=mean(PSOYBUSDQ)), by = .(year)]
df = fread("../../data/prices/raw/PMAIZMTUSDM.csv")[, year:= as.integer(format(as.Date(DATE, format="%Y/%m/%d"),"%Y"))]
df_2 = df[, .(maize=mean(PMAIZMTUSDM)), by = .(year)]
df = fread("../../data/prices/raw/PSUNOUSDM.csv")[, year:= as.integer(format(as.Date(DATE, format="%Y/%m/%d"),"%Y"))]
df_3 = df[, .(sunflower=mean(PSUNOUSDM)), by = .(year)]
df = fread("../../data/prices/raw/PWHEAMTUSDM.csv")[, year:= as.integer(format(as.Date(DATE, format="%Y/%m/%d"),"%Y"))]
df_4 = df[, .(wheat=mean(PWHEAMTUSDM)), by = .(year)]
df = fread("../../data/prices/raw/PRICENPQUSDQ.csv")[, year:= as.integer(format(as.Date(DATE, format="%Y/%m/%d"),"%Y"))]
df_5 = df[, .(rice=mean(PRICENPQUSDQ)), by = .(year)]
df = fread("../../data/prices/raw/PSUGAISAUSDQ.csv")[, year:= as.integer(format(as.Date(DATE, format="%Y/%m/%d"),"%Y"))]
df_6 = df[, .(sugar=mean(PSUGAISAUSDQ)*2204.62/100 ), by = .(year)] #convert from cents per lb to dollars per metric ton

df = merge(df_1, df_2, by=c('year'))
df = merge(df, df_3, by=c('year'))
df = merge(df, df_4, by=c('year'))
df = merge(df, df_5, by=c('year'))
df = merge(df, df_6, by=c('year'))

commodity_list = c("soybean","maize","wheat","rice","sugar","sunflower")
df_w = melt(df, idvar = c("year"), measure.vars = commodity_list, value.name = "mean", variable.name = "commodity")
df_w$commodity = as.character(df_w$commodity)


#################################### Brazil

#Currency conversion
df = fread("../../data/prices/raw/CCUSMA02BRM618N.csv")[, year:= as.integer(format(as.Date(date, format="%Y/%m/%d"),"%Y"))]
df_cu = df[, .(real_usd_rate=mean(real_usd_rate)), by = .(year)]

#Geographic units
for(unit in c('county_id', 'amc_id', 'microregion_id', 'mesoregion_id', 'state_id', 'region')){
  
  df_un = fread("../../data/landuse/clean/geographicunits_brazil.csv.gz")[, list(county_id, amc_id, microregion_id, mesoregion_id, state_id, region, country)]
  df_un$spatial_unit = unit
  if (unit=='county_id'){
    df_un$spatial_id = df_un$county_id 
    unit_rm_list = c('county_id')}
  if (unit=='amc_id'){
    df_un$spatial_id = df_un$amc_id 
    unit_rm_list = c('county_id','amc_id')}
  if (unit=='microregion_id'){
    df_un$spatial_id = df_un$microregion_id 
    unit_rm_list = c('county_id','amc_id','microregion_id')}
  if (unit=='mesoregion_id'){
    df_un$spatial_id = df_un$mesoregion_id 
    unit_rm_list = c('county_id','amc_id','microregion_id','mesoregion_id')}
  if (unit=='state_id'){
    df_un$spatial_id = df_un$state_id 
    unit_rm_list = c('county_id','amc_id','microregion_id','mesoregion_id','state_id')}
  if (unit=='region'){
    df_un$spatial_id = df_un$region 
    unit_rm_list = c('county_id','amc_id','microregion_id','mesoregion_id','state_id','region')}
  
  #Crop output and value
  df = fread("../../data/landuse/clean/cropland_brazil_county.csv.gz")
  setnames(df, old = c('crop'),  new = c('commodity'))
  df_cr = df[commodity %in% c('soybean','maize','wheat','rice','sugarcane','sunflower','beans','coffee'),]
  df_cr = reshape(df_cr, idvar = c("year", "county_id","commodity"), timevar = "variable", direction = "wide")
  setnames(df_cr, old = c('value.value_r','value.yield_tha','value.production_t','value.area_planted_ha'),  new = c('value_r','yield_tha','production_t','area_planted_ha'))
  df_cr = merge(df_un, df_cr, by=c('county_id'))
  df_cr = merge(df_cr, df_cu, by=c('year'))
  df_cr[, value_usd:=value_r/real_usd_rate]
  
  df_cr_u = df_cr[, .(production_t=sum(production_t), value_usd=sum(value_usd)), by = .(year, spatial_id, commodity)]
  df_cr_u = df_cr_u[production_t>0 & value_usd>0,][, c('price') := .(value_usd/production_t), ]
  df_outliers = df_cr_u[, .(lq=quantile(price,0.25), uq=quantile(price,0.75), iqr = IQR(price), llq = quantile(price,0.025), uuq = quantile(price,0.975) ), by = .(year, commodity)]
  df_cr_u = merge(df_cr_u, df_outliers, by=c('year','commodity'))
  df_cr_u = df_cr_u[price > llq & price < uuq,]
  df_cr_u = df_cr_u[price > lq-iqr*1.5 & price < uq+iqr*1.5,]
  
  #Save final data
  df_1 = df_un[!duplicated(df_un$spatial_id), ][,(unit_rm_list):= NULL]
  df_2 = df_cr_u[year>=1995 & commodity %in% c("soybean","maize","wheat","rice","sugarcane"),list(year, spatial_id, commodity, price)]
  df = merge(df_1, df_2, by=c('spatial_id'))[order(commodity, year, spatial_id),]
  write.csv(df, gzfile(paste0("../../data/prices/clean/farmgate_crops_brazil_",unit,".csv.gz")), row.names = FALSE)
  
}
